{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 一个官方示例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\softwares\\python\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "# https://www.sbert.net/\n",
    "# 导入\n",
    "from sentence_transformers import SentenceTransformer, util"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the sentence-embedding model used by the rest of the notebook.\n",
    "#\n",
    "# Optional one-time setup (PowerShell syntax) to pre-download the weights\n",
    "# through the hf-mirror.com endpoint:\n",
    "#   conda activate test\n",
    "#   pip install -U huggingface_hub\n",
    "#   $env:HF_ENDPOINT = \"https://hf-mirror.com\"\n",
    "#   huggingface-cli download --resume-download sentence-transformers/all-MiniLM-L6-v2\n",
    "#\n",
    "# To load from disk instead, set MODEL_NAME to the downloaded snapshot directory, e.g.\n",
    "#   ~/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/<revision>\n",
    "MODEL_NAME = \"all-MiniLM-L6-v2\"\n",
    "model = SentenceTransformer(MODEL_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 要编码的句子\n",
    "sentences = [\n",
    "    \"Sentences are passed as a list of string.\",\n",
    "    \"Sentences are passed as a list of some strings.\",\n",
    "    \"The quick brown fox jumps over the lazy dog.\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3, 384)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# ⭐ Core API: encode every sentence into an embedding vector,\n",
    "# then display the shape of the resulting array.\n",
    "embeddings = model.encode(sentences)\n",
    "embeddings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence: Sentences are passed as a list of string.\n",
      "Embedding: 384 [ 0.05645248  0.05500241  0.03137959  0.03394852 -0.03542471  0.08346676\n",
      "  0.09888009  0.00727546 -0.00668661 -0.00765813]\n",
      "\n",
      "Sentence: Sentences are passed as a list of some strings.\n",
      "Embedding: 384 [ 0.04327982  0.05298091  0.04216757  0.02804001 -0.02177015  0.08530058\n",
      "  0.1099958  -0.00747292 -0.00966357 -0.01171357]\n",
      "\n",
      "Sentence: The quick brown fox jumps over the lazy dog.\n",
      "Embedding: 384 [ 0.0439335   0.05893444  0.04817839  0.07754812  0.02674436 -0.03762956\n",
      " -0.00260513 -0.05994308 -0.00249604  0.02207283]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Show each sentence next to the length and first 10 values of its embedding.\n",
    "for text, vector in zip(sentences, embeddings):\n",
    "    print(f\"Sentence: {text}\")\n",
    "    # end=\"\\n\\n\" leaves a blank line between consecutive sentences.\n",
    "    print(\"Embedding:\", len(vector), vector[:10], end=\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[0.9861]])\n",
      "tensor([[0.1036]])\n"
     ]
    }
   ],
   "source": [
    "# Cosine similarity of the first sentence against each of the other two.\n",
    "cos_sim_1, cos_sim_2 = (\n",
    "    util.cos_sim(embeddings[0], embeddings[idx]) for idx in (1, 2)\n",
    ")\n",
    "print(cos_sim_1, cos_sim_2, sep=\"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
